Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / freeWAIS-sf-1.1 / ir / sersrch.c < prev next >

Wrap

C/C++ Source or Header | 1994-12-13 | 40.7 KB | 1,399 lines

/* Copyright (c) CNIDR (Work in progress) */ /* WIDE AREA INFORMATION SERVER SOFTWARE No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* implements the search part of irext.h (search_word and finished_search_word) -brewster Split from irsearch.c 5/31/91 Added scale_scores. Fixed document_score_array to long. 7/8/91 Removed scale_scores, handled in search_word with doc_id > 0. 2/4/92 Made document_score_array a double. - Jonny G * $Log: sersrch.c,v $ * Revision 1.54 1994/12/13 17:03:58 pfeifer * *** empty log message *** * * Revision 1.53 1994/11/14 15:58:17 pfeifer * Patch by Archie Warnoc in c.i.w (must be made size dependent?) * * Revision 1.52 1994/09/06 16:53:48 pfeifer * Syn cache patch * * Revision 1.51 1994/08/05 09:46:46 pfeifer * No more 'MAXINT redefined' complaints. * * Revision 1.50 1994/08/05 07:12:38 pfeifer * Release beta 04 * * Revision 1.49 1994/07/13 07:52:36 huynh1 * Uli * * Revision 1.48 1994/05/27 09:13:21 huynh1 * boolean code updated. beta * * Revision 1.47 1994/05/26 14:33:57 huynh1 * search_word updated (read_weight_from_stream). * beta. * * Revision 1.46 1994/05/20 12:49:58 pfeifer * beta * * Revision 1.45 1994/05/19 12:44:39 huynh1 * search_word updated. * * Revision 1.44 1994/05/18 17:28:13 huynh1 * new term weighting * higher retrieval quality. * * Revision 1.40 1994/04/28 16:28:01 huynh1 * stemming * * Revision 1.39 1994/04/06 23:52:04 huynh1 * 08, autoconf, Uli * * Revision 1.38 1994/03/23 13:11:07 pfeifer * removed include iso.h * * Revision 1.37 1994/03/08 20:46:12 huynh1 * Patchlevel 04 * * Revision 1.36 1994/02/14 10:33:04 huynh1 * new code for field concept added. * * Revision 1.36 1993/12/08 17:38:00 huynh1 * bug by mixing literal and nested boolean corrected! 
* * Revision 1.10 1993/10/13 14:14:20 huynh1 * new code added for encapsulated boolean queries and * modified literal search * * Revision 1.3 1993/07/13 08:19:56 pfeifer * Sicherung vor Aenderungen Tung * * Revision 1.1 1993/02/16 15:05:35 freewais * Initial revision * * Revision 1.24 92/04/28 16:56:54 morris * added boolean to serial engine * * Revision 1.23 92/03/15 10:15:18 jonathan * Added Simon Spero's ASSIGN replacement for read_bytes. * * Revision 1.22 92/03/05 07:09:54 shen * add two more dummy arguments to call to init_search_engine * * Revision 1.21 92/02/12 17:29:52 jonathan * Conditionalized inclusion of object code. * * Revision 1.20 92/02/12 13:40:06 jonathan * Added "$Log" so RCS will put the log message in the header * */ #include "cutil.h" #include "irfiles.h" #ifdef BIO #include "irtfiles.h" /* dgg, for wordDelimiter */ #endif #include "irsearch.h" #include "irext.h" #include "byte_order.h" /* #include <string.h> */ #include <ctype.h> #include <math.h> #ifdef MAXINT #undef MAXINT #endif #define MAXINT (unsigned long)2^(sizeof(long)*8-1) #define VALUE 1000000L /* francois */ #include "stemmer.h" /* tung, 10/93 */ #ifdef NESTED_BOOLEANS #include "boolean_op.h" #endif /* tung, 10/93 */ #ifdef FIELDS /* tung, 1/94 */ #include "field_search.h" #endif #ifdef NEW_WEIGHT /* tung, 5/94 */ #include "weight.h" #endif #ifdef BOOL #include "obj.h" #include "irparse.h" object* currentQuery = NULL; /* kludge until irext goes away */ #endif /* def BOOL */ /* weighting for relevant document terms - this may become a parameter to the query. 
*/ #define RF_WEIGHTING 0.1 /* ================================== * === Initialization Functions === * ==================================*/ long init_search_engine(file, initialize, for_search, cm_mem_percent, text_size, grow_percent) char* file; boolean initialize; boolean for_search; long cm_mem_percent; /* unused */ long text_size; /* unused */ long grow_percent; /* unused */ { static boolean inited = false; if (inited == false) { #ifdef BOOL initObj(); initBool(); #endif inited = true; } return(0); } long finished_search_engine() { #ifdef CACHE_SYN /* clean up shared memory segments */ if (cacheSynId) { int i; char *pcs; t_cacheSyn *syn_Cache, *cs; if ((syn_Cache = (t_cacheSyn *) shmat (cacheSynId, 0, 0)) != ((t_cacheSyn *)-1)) { pcs = (char *) syn_Cache; for (i = 0, cs = (t_cacheSyn *) pcs; i < MAX_SYN_CACHE && cs->id; i++, pcs += sizeof(t_cacheSyn), cs = (t_cacheSyn *) pcs) if (shmctl(cs->id, IPC_RMID, (t_cacheSyn *)0) < 0) waislog (WLOG_HIGH, WLOG_WARNING, "Error detatching shared memory segment (id=%d)", cs->id); if (shmctl(cacheSynId, IPC_RMID, (t_cacheSyn *)0) < 0) waislog (WLOG_HIGH, WLOG_WARNING, "Error detatching shared memory segment (id=%d)", cacheSynId); } } #endif return(0); } /* * ext_open_database: see irext.h */ long ext_open_database (db, initialize, for_search) database *db; boolean initialize; boolean for_search; { /* this has to deal with the .inv file */ char file[MAX_FILE_NAME_LEN]; if(initialize) /* make a new one */ db->index_stream = s_fopen(index_filename(file, db), "w+b"); else if(for_search) /* just search */ db->index_stream = s_fopen(index_filename(file, db), "rb"); else /* write to an existing db */ db->index_stream = s_fopen(index_filename(file, db), "r+b"); if (db->index_stream == NULL) { waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n", file); disposeDatabase(db); return(1); } return(0); } /* * ext_close_database: see irext.h */ long ext_close_database (db) database *db; { return(0); } char 
*database_file(database_name) char *database_name; { return(database_name); } /*===========================* *=== Setting Paramters ===* *===========================*/ long max_hit_retrieved = 0; char **srcs = NULL; long set_query_parameter (mask, parameters) long mask; query_parameter_type * parameters; { switch (mask) { case SET_MAX_RETRIEVED_MASK: max_hit_retrieved = parameters->max_hit_retrieved; return(0); break; case SET_SELECT_SOURCE: if(NULL != srcs){ if(NULL != srcs[0]) s_free(srcs[0]); s_free(srcs); } srcs = parameters->srcs; break; default: return(-1); break; } return(0); } /*==============================* *=== Document Score Array ===* *==============================*/ double *document_score_array = NULL; long document_score_array_len = 0; #ifdef NESTED_BOOLEANS /* tung, 1/94 */ double *NumPart_score_array = NULL; #else #ifdef BOOLEANS double *prev_score_array = NULL; /* 12/91 GS TLG */ #endif #endif #ifdef NESTED_BOOLEANS /* tung, 10/93 */ search_result_struct *search_result_array = NULL; long operand_id = 0; static void clear_search_result_array _AP((long* number_of_elements)); static void clear_search_result_array(number_of_elements) long* number_of_elements; { long count; if(*number_of_elements > 1 && search_result_array != NULL) { for(count=0; count < *number_of_elements; count++) { if(search_result_array[count].doc_ids_array != NULL) s_free(search_result_array[count].doc_ids_array); } s_free(search_result_array); } *number_of_elements = 1; } static void make_search_result_array _AP((long length)); static void make_search_result_array(length) long length; { if(search_result_array == NULL) { search_result_array = (search_result_struct *) s_malloc((size_t)(length * sizeof(search_result_struct))); operand_id = 0; } } static boolean make_doc_ids_array _AP((long pos, long length)); static boolean make_doc_ids_array(pos, length) long pos; long length; { /* if(search_result_array[pos].doc_ids_array == NULL) */ search_result_array[pos].doc_ids_array = 
(doc_descr_struct *) s_malloc((size_t)(sizeof(doc_descr_struct) * length)); if(search_result_array[pos].doc_ids_array == NULL) { waislog(WLOG_HIGH, WLOG_ERROR, "Out of memory"); return(false); } return(true); } /* tung, 10/93 */ #endif /* make_document_score_array insures that the document_score_array array is long enough, if not it makes it long enough */ static void make_document_score_array _AP((long length )); static void make_document_score_array(length) long length; { if(length <= document_score_array_len) return; /* we have to make a new one. free the old one first (if any) */ if(document_score_array != 0){ s_free(document_score_array); #ifdef NESTED_BOOLEANS /* tung, 1/94 */ s_free(NumPart_score_array); #else #ifdef BOOLEANS s_free(prev_score_array); /* 12/91 GS TLG */ #endif #endif } document_score_array = (double*)s_malloc((size_t)(length * sizeof(double))); #ifdef NESTED_BOOLEANS /* tung, 1/94 */ NumPart_score_array = (double*)s_malloc((size_t)(length * sizeof(double))); memset(NumPart_score_array, 0, document_score_array_len * sizeof(double)); #else #ifdef BOOLEANS prev_score_array = (double*)s_malloc((size_t)(length * sizeof(double))); /* 12/91 GS TLG */ #endif #endif document_score_array_len = length; } static void destroy_document_score_array _AP((void)); static void destroy_document_score_array() { s_free(document_score_array); #ifdef NESTED_BOOLEANS /* tung, 1/94 */ s_free(NumPart_score_array); #else #ifdef BOOLEANS s_free(prev_score_array); /* 12/91 GS TLG */ #endif #endif document_score_array_len = 0; } void clear_document_score_array() /* side effects the document_score_array. 
*/ { memset(document_score_array, 0, document_score_array_len * sizeof(double)); #ifdef NESTED_BOOLEANS /* tung, 1/94 */ memset(NumPart_score_array, 0, document_score_array_len * sizeof(double)); #else #ifdef BOOLEANS memset(prev_score_array, 0, /* 12/91 GS TLG */ document_score_array_len * sizeof(double)); /* 12/91 GS TLG */ #endif #endif } /* for debugging purposes */ void print_document_score_array(start,stop) unsigned long start; unsigned long stop; /* assumes start >= 0, stop < db->doc_table_allocated_entries */ { long i; for(i = start; i <= stop; i++){ printf("entry number %d: %f \n", i, document_score_array[i]); } } /*=========================* *=== Best Hits Array ===* *=========================*/ hit *best_hits_array = NULL; long best_hits_array_len = 0; long current_best_hit = 0; long doc_start = 0; /* tung, 5/94 */ long doc_end = 0; /* tung, 5/94 */ /* see irext.h for doc */ long init_best_hit (db) database *db; { #ifdef BOOL if (currentQuery != NULL) send(currentQuery,InitBestHit,db); #endif /* def BOOL */ return(0); } /* make_best_hits_array insures that the best_hits_array array is long enough, if not it makes it long enough */ static void make_best_hits_array _AP((long length)); static void make_best_hits_array(length) long length; { if(length <= best_hits_array_len) return; /* we have to make a new one. free the old one first (if any) */ if(best_hits_array != 0){ s_free(best_hits_array); } best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit))); best_hits_array_len = length; } static void destroy_best_hits_array _AP((void)); static void destroy_best_hits_array() { s_free(best_hits_array); best_hits_array_len = 0; } void clear_best_hits_array() /* side effects the best_hits_array. 
XXX could use memset */
{
  /* every one of the best_hits_array_len entries is zeroed in one call */
  memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit));
}

/* for debugging purposes */
/* print_best_hits writes one line to stdout for every entry of
   best_hits_array whose weight is not 0. */
void print_best_hits()
{
  long i;
  for( i = 0; i < best_hits_array_len; i++){
    if (best_hits_array[i].weight != 0)
      {
        printf("Best hit %ld: weight %lf, doc_id %ld, headline %s, filename %s, lines %ld\n",
               i, best_hits_array[i].weight,
               best_hits_array[i].document_id,
               best_hits_array[i].headline,
               best_hits_array[i].filename,
               best_hits_array[i].number_of_lines);
      }
  }
}

/* sort_best_hits fills the first max_hit_retrieved slots of
   best_hits_array with the highest scoring documents found in
   document_score_array, best first.  Only the document ids doc_start
   through doc_end (inclusive) are inspected.
   NOTE(review): nothing here checks that best_hits_array holds at least
   max_hit_retrieved entries - presumably the caller sized it with
   make_best_hits_array; confirm. */
void sort_best_hits(db)
     database * db;
{
  /* returns nothing.
   * side effects best_hits and document_score_array
   */

  long i, doc;
  /* stays 0.0 for the whole function, so the comparison against it below
     only filters out weights that are not positive */
  double worst_weight_to_make_it = 0.0;
  document_table_entry doc_entry;
  long best_hit_number = 0;

  /* snuff the scores */
  for(i = 0; i < max_hit_retrieved; i++){
    best_hits_array[i].weight = 0.0;
  }

  /* loop over the doc, and keep the doc_id and weight in best hit table */
  /* for(doc = 0; doc < db->doc_table_allocated_entries; doc++){ */
  for(doc = doc_start; doc <= doc_end; doc++) {
    double weight = document_score_array[doc];
    /* jmf */
    if(weight > 0) {
#ifndef NEW_WEIGHT /* tung, 5/94 */
      /* without NEW_WEIGHT the raw score is divided by the document length;
         a document of length 0 gets weight 0.
         NOTE(review): the result of read_document_table_entry is not
         checked here, so doc_entry may be stale if the read fails. */
      read_document_table_entry(&doc_entry, doc, db);
      /* if this could be removed, we'd gain speed */
      if (doc_entry.document_length)
        weight/=doc_entry.document_length;
      else
        weight = 0;
#endif
      if(worst_weight_to_make_it < weight){
        /* merge it into the best_hits array. 
start at the bottom */ for(i = (max_hit_retrieved - 1); i >= 0; i--){ if(weight > best_hits_array[i].weight /* && (check_document_id(doc, db) == true) too slow.*/ ){ /* move this entry down */ if((i + 1) < max_hit_retrieved){ best_hits_array[i+1].weight = best_hits_array[i].weight; best_hits_array[i+1].document_id = best_hits_array[i].document_id; } best_hits_array[i].document_id = doc; best_hits_array[i].weight = weight; } else break; } } } } doc_start = doc_end = 0; /* tung, 5/94 */ for(i = 0; i < max_hit_retrieved; i++){ if(best_hits_array[i].weight <= 0.0) return; if (read_document_table_entry(&doc_entry, best_hits_array[i].document_id, db) == true){ best_hits_array[best_hit_number].weight = best_hits_array[i].weight; best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id; best_hits_array[best_hit_number].start_character = doc_entry.start_character; best_hits_array[best_hit_number].end_character = doc_entry.end_character; best_hits_array[best_hit_number].document_length = doc_entry.document_length; best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines; sprintf(best_hits_array[best_hit_number].date, "%d", doc_entry.date); read_filename_table_entry(doc_entry.filename_id, best_hits_array[best_hit_number].filename, best_hits_array[best_hit_number].type, NULL, db), strncpy(best_hits_array[best_hit_number].headline, read_headline_table_entry(doc_entry.headline_id,db), MAX_HEADLINE_LEN); best_hit_number++; } beFriendly(); } for(i = best_hit_number; i < max_hit_retrieved; i++){ best_hits_array[best_hit_number].weight = 0.0; } /* print_best_hits(s); for debugging */ } /* returns the next best hit */ long best_hit(db, doc_id, best_character, best_line, score,start,end,date, length,nlines,headline,filename,type) database *db; long *doc_id; long *best_character; long *best_line; double *score; long *start,*end,*date,*length,*nlines; char *headline,*filename,*type; { double tmp; *best_character = 0; *best_line = 0; #ifdef BOOL 
if (currentQuery != NULL) /* for boolean */ { send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score); if (*doc_id > 0) return(0); /* ok */ else return(-1); /* no more docs */ } #endif /* BOOL */ if(current_best_hit > best_hits_array_len) return(1); if(best_hits_array[current_best_hit].weight == 0.0) return(1); *doc_id = best_hits_array[current_best_hit].document_id; tmp = ((double)(best_hits_array[current_best_hit].weight*VALUE)); *score=tmp; *start=best_hits_array[current_best_hit].start_character; *end=best_hits_array[current_best_hit].end_character; *date=atol(best_hits_array[current_best_hit].date); *length=best_hits_array[current_best_hit].document_length; *nlines=best_hits_array[current_best_hit].number_of_lines; strcpy(headline,best_hits_array[current_best_hit].headline); strcpy(filename,best_hits_array[current_best_hit].filename); strcpy(type,best_hits_array[current_best_hit].type); current_best_hit++; return(0); } long finished_best_hit(db) database *db; { #ifdef BOOL if (currentQuery != NULL) /* for boolean */ { send(currentQuery,Delete); currentQuery = NULL; return(0); } #endif /* BOOL */ /* if we are on a small machine, we might want to destroy_document_score_array */ clear_document_score_array(); clear_best_hits_array(); current_best_hit = 0; return(0); } /*=============================* *=== Searching for words ===* *=============================*/ /* see irext.h for doc */ long init_search_word (db) database* db; { char fn[256]; strcpy( fn,db->database_file ); strcat( fn,synonym_ext ); syn_ReadFile( fn,&db->syn_Table,&db->syn_Table_Size ); return(0); } #ifdef NESTED_BOOLEANS /* tung, 10/93 */ extern long number_of_operands ; /* tung, 10/93 */ #endif #ifdef BOOLEANS static boolean gLastAnd= false; static boolean gLastNot= false; #endif /* see irext.h for doc */ long search_word(word, #ifdef FIELDS /* tung, 5/94 */ field_name, #endif char_pos, line_pos, weight, doc_id, word_pair, db) char *word; /* the word to be searched for */ #ifdef 
FIELDS /* tung, 5/94 */ char *field_name; #endif long char_pos; /* the position of the start of the word */ long line_pos; /* is this needed? not for signature system */ long weight; /* how important the word looks syntactically, such as is it bold */ long doc_id; /* current document, seed words is 0, then it increments into the relevant document */ long word_pair; database *db; { /* this side effects the document_score_array, * and downcases the word. * Returns 0 if successful or word not present, * returns non-0 if an error. * */ long not_full_flag = INDEX_BLOCK_FULL_FLAG; /* start out full so it will go on looking */ long count, index_block_size; long internal_document_id, number_of_valid_entries; double internal_weight; long index_file_block_number; long number_of_occurances; FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)]; char *index_buffer; #ifdef undef char *i = index_buffer; /* What the hell should be in i ? (up) */ #endif FILE *stream = NULL; #ifdef LITERAL long txt_pos, icnt, wcnt, pcnt; /* 2/92 GS TLG */ document_table_entry doc_entry; /* 2/92 GS TLG */ static FILE *txt_stream = NULL; /* 2/92 GS TLG */ char cmpr_word[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */ /*char phrase[MAX_PHRASE_LENGTH + 1]; */ /* 2/92 GS TLG */ char txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */ char *temp_txt_filename = NULL; /* francois */ char prev_txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */ char txt_type[MAX_TYPE_LEN + 1]; /* 2/92 GS TLG */ long phraselen= 0, txt_pos_fix= 0; char *document_section = NULL; /* tung , 10/93 */ long document_section_len = 0; /* tung , 10/93 */ long phrase_readed = 0; /* tung , 10/93 */ long phrase_count = 0; /* tung , 10/93 */ boolean phrase_found = false; /* tung , 10/93 */ #endif #ifdef NESTED_BOOLEANS /* tung, 10/93 */ long numeric_partial_valid_entries = 0; #endif #ifdef NEW_WEIGHT /* tung, 5/94 */ double query_wgt; #else double idf; #endif #ifdef FIELDS /* tung, 12/93 */ long field_id = -1; boolean SearchField = 
false; #endif /* do synonym conversion */ /* in theory, one can replace a word with a boolean phrase */ char *newword; newword = lookup_Synonym( word,db->syn_Table,db->syn_Table_Size ); waislog(WLOG_HIGH,WLOG_INFO,"Word %s Syn %s",word,newword); strncpy(word,newword,MAX_WORD_LENGTH); #ifdef FIELDS /* tung, 12/93 */ if(db->number_of_fields > 0) { if(*field_name != '\0') { if(strcmp(field_name, FREE_TEXT_FIELD) == 0) { /* global database */ field_name = "\0"; SearchField = false; field_id = -1; } else { SearchField = true; field_id = pick_up_field_id(field_name, db); } } } #endif /* tung, 10/93 */ #ifdef NESTED_BOOLEANS if(number_of_operands > 1) { make_search_result_array(number_of_operands); if((weight!=LITERAL_FLAG) && IsOperator(word)) { boolean_operations(word, search_result_array); return(0); } if(strlen(word) == 1) { search_result_array[operand_id].number_of_hits = 0; search_result_array[operand_id].operand_id = operand_id; if(!save_operand_id(operand_id, search_result_array, db->doc_table_allocated_entries)) return(-1); ++operand_id; return(0); } } #endif /* tung, 10/93 */ /* francois - call the stemmer */ #ifdef FIELDS /* tung, 1/94 */ if(weight!=LITERAL_FLAG && weight!= FIELD_FLAG && weight!= NUMERIC_FLAG) { #ifdef STEM_WORDS if(field_id > -1) { if(db->fields[field_id].stemming) stemmer(word); } else { if(db->stemming) stemmer(word); } #endif } #else #ifdef LITERAL if (weight!=LITERAL_FLAG) { stemmer(word); } #else stemmer(word); #endif #endif #ifdef LITERAL if (weight==LITERAL_FLAG) { /* goto after_booleans */ /* printf("search_word: literal word is [%s]\n", word); */ } else #endif #ifndef NESTED_BOOLEANS /* 10,93 */ #ifdef BOOLEANS if (strcmp(word,BOOLEAN_AND)==0) { /* should be all lowercase cmp here */ gLastAnd= true; return(0); } else if (strcmp(word,BOOLEAN_NOT)==0) { /* ^^ this is bad if we intersperse "not"s in a query -- docs found after not word may include notted word -- need to go back to doing not words after others -- but need now to check for 
literal string first */ gLastNot= true; return(0); } if (weight == BOOLEAN_NOT_FLAG) gLastNot= true; #else ; /* if not LITERAL_FLAG */ #endif #endif /* #ifndef NESTED_BOOLEANS */ index_buffer = (char*)index_buffer_data; #ifdef LITERAL if (weight==LITERAL_FLAG) { /* note: we found the first word of phrase once in map_over_words, but i'm too lazy to put another parameter in that cascade of function calls it takes to get here. */ char word1[MAX_WORD_LENGTH + 1]; register int i, len; register boolean more; phraselen= MINIMUM( MAX_PHRASE_LENGTH, strlen(word)); len = MINIMUM( MAX_WORD_LENGTH, phraselen); for (i=0, more=true; i < len && more; ) { word1[i] = word[i++]; #ifdef BIO more= (wordDelimiter(word[i]) == NOT_DELIMITER); #else more= (isalnum(word[i])); #endif } word1[i]= '\0'; txt_pos_fix= strlen(word1) + 1; /* printf("search_word: literal word1 is [%s]\n", word1); */ #ifdef FIELDS /* tung, 1/94 */ if((db->number_of_fields == 0) && !SearchField) index_file_block_number = look_up_word_in_dictionary(word1, &number_of_occurances, db); else index_file_block_number = field_look_up_word_in_dictionary(field_name, word1, &number_of_occurances, db); #else index_file_block_number = look_up_word_in_dictionary(word1, &number_of_occurances, db); #endif } else #endif /* LITERAL */ #ifdef PARTIALWORD #ifdef FIELDS /* tung, 1/94 */ index_file_block_number = look_up_partialword_in_dictionary(field_name, word, &number_of_occurances, db); #else index_file_block_number = look_up_partialword_in_dictionary(word, &number_of_occurances, db); #endif #else index_file_block_number = look_up_word_in_dictionary(word, &number_of_occurances, db); #endif current_best_hit = 0; /* so that the best hits willstart from 0 */ /* check the document_score_array */ if(document_score_array_len < db->doc_table_allocated_entries) make_document_score_array(db->doc_table_allocated_entries); if(index_file_block_number >= 0){ #ifdef PARTIALWORD while(index_file_block_number > 0){ /* dgg, need 2nd loop here for 
multiple partwords */ #endif #ifdef FIELDS /* tung, 1/94 */ if(SearchField && *field_name != '\0') stream = db->field_index_streams[pick_up_field_id(field_name, db)]; else stream = db->index_stream; #else stream = db->index_stream; #endif while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) && (index_file_block_number != 0)){ /* read the index block */ if (0 != fseek(stream, (long)index_file_block_number, SEEK_SET)) { waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the inverted file to position %ld", (long)index_file_block_number); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif return(-1); } /* read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE); ASSIGN(not_full_flag, INDEX_BLOCK_FLAG_SIZE, index_buffer, INDEX_BLOCK_HEADER_SIZE, 0 ); ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE, index_buffer+INDEX_BLOCK_FLAG_SIZE, INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_FLAG_SIZE); ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE, index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE, INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE); this is equivalent, but slower: */ not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream); index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream); index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream); /* Jim's debug code commented out printf("flag = %d, block_num = %d, block_size = %d\n", not_full_flag, index_file_block_number, index_block_size); */ fflush(stdout); if(EOF == index_block_size) { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the index file failed"); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif return(-1); } if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){ /* not full */ number_of_valid_entries = index_file_block_number; } else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){ /* full */ number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE; } else{ /* bad news, file is corrupted. */ waislog(WLOG_HIGH, WLOG_ERROR, "Expected the flag in the inverted file to be valid. 
it is %ld", not_full_flag); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif return(-1); } /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */ /* add the array to the document_score_array */ number_of_valid_entries /= INDEX_ELEMENT_SIZE; /* tung, 10/93 */ #ifdef NESTED_BOOLEANS if((number_of_operands > 1) && (search_result_array != NULL)) { #ifdef FIELDS /* tung, 1/94 */ if(weight != NUMERIC_FLAG && weight != PARTIAL_FLAG) { #else if(weight != PARTIAL_FLAG) { #endif if(!make_doc_ids_array(operand_id, db->doc_table_allocated_entries)) return(-1); search_result_array[operand_id].number_of_hits = number_of_valid_entries; } } #endif /* tung, 10/93 */ #ifdef NEW_WEIGHT /* tung, 5/94 */ query_wgt = 1; #else /* ses - idf is a fist approximation to the inverse document freq. */ /* what it actually is is the inverse occurance frequency which says * that the significance of a word is inversly proportional to the number * of times it occurs in the database */ idf=1.0/number_of_occurances; #endif for(count=0;count < number_of_valid_entries;count++) { int wgt; int did; /* if(count%1024 == 0) { read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE* MINIMUM(1024,number_of_valid_entries-count)); i=index_buffer; } */ did = read_bytes(DOCUMENT_ID_SIZE, stream); (void)read_bytes(WORD_POSITION_SIZE, stream); txt_pos=read_bytes(CHARACTER_POSITION_SIZE, stream); wgt = read_bytes(WEIGHT_SIZE,stream); #ifdef NEW_WEIGHT /* tung, 5/94 */ internal_weight = read_weight_from_stream(NEW_WEIGHT_SIZE, stream); #endif /* ASSIGN(wgt,WEIGHT_SIZE, i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE, INDEX_ELEMENT_SIZE, DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE); ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0); */ #ifdef LITERAL /* dgg -- is this proper update of read form to ASSIGN form ??*/ /* txt_pos = read_bytes(CHARACTER_POSITION_SIZE, stream);*/ /* 2/92 GS TLG */ if ((weight == LITERAL_FLAG) && (0 == doc_id)) { /* 
ASSIGN(txt_pos,CHARACTER_POSITION_SIZE,i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE, INDEX_ELEMENT_SIZE,DOCUMENT_ID_SIZE+WORD_POSITION_SIZE); */ /* printf("search_word: txtpos=%d, wgt=%d, did=%d\n", txt_pos, wgt, did); */ } #endif /* Commented out as suggested by Stan Isaacs at hp.com to come up with correct * weights when there are multiple documents in a file * * if(wgt>5L) * wgt-=5L; */ #ifndef NEW_WEIGHT /* tung, 5/94 */ internal_weight = log((double)wgt); internal_weight+=10.0; #endif internal_document_id = did; if((doc_start == 0) && (doc_end == 0)) /* tung, 5/94 */ doc_start = doc_end = did; /* tung, 5/94 */ doc_start = MINIMUM(doc_start, did); /* tung, 5/94 */ doc_end = MAXIMUM(doc_end, did); /* tung, 5/94 */ /* printf("entry %ld, Doc_id: %ld, weight %lf \n", count, internal_document_id, internal_weight); fflush(stdout); */ if(EOF == wgt) { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the doc-id table failed"); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif return(-1); } #ifdef LITERAL if ((weight == LITERAL_FLAG) && (0 == doc_id)) { /* 2/92 GS TLG */ if (true == read_document_table_entry(&doc_entry, /* 2/92 GS TLG */ internal_document_id, db)) /* 2/92 GS TLG */ { /* 2/92 GS TLG */ read_filename_table_entry(doc_entry.filename_id, /* 2/92 GS TLG */ txt_filename, txt_type, NULL, db); /* 2/92 GS TLG */ /* printf("search_word: document is [%s]\n", txt_filename); */ if (NULL == txt_stream) { /* francois */ if (probe_file(txt_filename)) { txt_stream = s_fopen(txt_filename, "rb"); } else if (probe_file_possibly_compressed(txt_filename)) { temp_txt_filename = s_fzcat(txt_filename); if (temp_txt_filename) { txt_stream = s_fopen(temp_txt_filename, "rb"); } } strcpy(prev_txt_filename, txt_filename); } else if (0 != strcmp(txt_filename, prev_txt_filename)) { s_fclose(txt_stream); /* francois */ if ( temp_txt_filename != NULL ) { unlink(temp_txt_filename); s_free(temp_txt_filename); } if (probe_file(txt_filename)) { txt_stream = s_fopen(txt_filename, "rb"); } else if 
(probe_file_possibly_compressed(txt_filename)) { temp_txt_filename = s_fzcat(txt_filename); if (temp_txt_filename) { txt_stream = s_fopen(temp_txt_filename, "rb"); } } strcpy(prev_txt_filename, txt_filename); /* 2/92 GS TLG */ } txt_pos += doc_entry.start_character - txt_pos_fix; /* dgg */ document_section_len = doc_entry.end_character - txt_pos; /* tung, 10/93 */ s_fseek(txt_stream, txt_pos, SEEK_SET); /* 2/92 GS TLG */ document_section = (char*) s_malloc((size_t)((document_section_len+1)*sizeof(char))); /* tung, 10/93 */ fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */ phrase_readed = 0; /* tung, 10/93 */ phrase_readed += strlen(document_section); /* tung, 10/93 */ document_section = string_downcase(document_section); /* tung, 10/93 */ #if 0 fread(phrase, 1L, phraselen, txt_stream); /* 2/92 GS TLG */ /* { phrase[phraselen]= '\0'; printf("search_word: file phrase is [%s]\n", phrase); } */ if (0 != strncasecmp(word, phrase, phraselen)) /* 2/92 GS TLG */ internal_weight = 0.0; /* 2/92 GS TLG */ #endif if (NULL == strstr(document_section, word)) { /* tung, 10/93 */ while(phrase_readed < document_section_len) { /* tung, 10/93 */ fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */ phrase_readed += strlen(document_section); /* tung, 10/93 */ document_section = string_downcase(document_section); /* tung, 10/93 */ if(strstr(document_section, word) != NULL) { /* tung, 10/93 */ phrase_found = true; /* tung, 10/93 */ break; /* tung, 10/93 */ } /* tung, 10/93 */ } /* tung, 10/93 */ if(phrase_found == false) /* tung, 10/93 */ internal_weight = 0.0; /* tung, 10/93 */ phrase_found = false; /* tung, 10/93 */ } s_free(document_section); /* tung, 10/93 */ } } #endif #ifndef NESTED_BOOLEANS /* 10,93 */ #ifdef BOOLEANS if (gLastNot) { document_score_array[internal_document_id] = 0; /* printf("search_word: boolean 'not' scored\n"); */ } else #endif #endif /* #ifndef NESTED_BOOLEANS */ { /* if(doc_id > 0) we are doing a relevant 
document */ /* printf("wgt: %ld, internal weight: %lf, idf: %lf occurances: %ld\n", wgt,internal_weight, idf,number_of_occurances); fflush(stdout); */ #ifndef NEW_WEIGHT /* tung, 5/94 */ internal_weight*=idf; /* ses - for inverse doc. freq. */ #endif #ifndef NESTED_BOOLEANS #ifdef NEW_WEIGHT /* tung, 5/94 */ document_score_array[internal_document_id] += (query_wgt * internal_weight); #else document_score_array[internal_document_id] += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; #endif #else /* tung, 10/93 */ if(number_of_operands == 1) { #ifdef NEW_WEIGHT /* tung, 5/94 */ document_score_array[internal_document_id] += (query_wgt * internal_weight); #else document_score_array[internal_document_id] += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; #endif } else { if((number_of_operands > 1) && (search_result_array != NULL)) { if(weight == LITERAL_FLAG) { #ifdef NEW_WEIGHT /* tung, 5/94 */ ((search_result_array[operand_id]).doc_ids_array[phrase_count]).score += (query_wgt * internal_weight); #else ((search_result_array[operand_id]).doc_ids_array[phrase_count]).score += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; #endif if(((search_result_array[operand_id]).doc_ids_array[phrase_count]).score > 0) { ((search_result_array[operand_id]).doc_ids_array[phrase_count]).doc_id = internal_document_id; phrase_count++; search_result_array[operand_id].number_of_hits = phrase_count; } } #ifdef FIELDS /* tung, 1/94 */ else if(weight == NUMERIC_FLAG || weight == PARTIAL_FLAG) { #else else if(weight == PARTIAL_FLAG) { #endif if(NumPart_score_array[internal_document_id] <= 0) ++numeric_partial_valid_entries; #ifdef NEW_WEIGHT /* tung, 5/94 */ NumPart_score_array[internal_document_id] = MAXIMUM(NumPart_score_array[internal_document_id], (query_wgt * internal_weight)); #else NumPart_score_array[internal_document_id] += (doc_id) ? 
(internal_weight * RF_WEIGHTING) : internal_weight; #endif } else { ((search_result_array[operand_id]).doc_ids_array[count]).doc_id = internal_document_id; #ifdef NEW_WEIGHT /* tung, 5/94 */ ((search_result_array[operand_id]).doc_ids_array[count]).score += (query_wgt * internal_weight); #else ((search_result_array[operand_id]).doc_ids_array[count]).score += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; #endif } } } #endif /* tung, 10/93 */ } /* printf("Score array: %lf\n",document_score_array[internal_document_id]); fflush(stdout); */ /* i+=INDEX_ELEMENT_SIZE; Purify (umr): uninitialized memory read: (up) */ } } #ifdef PARTIALWORD #ifdef FIELDS /* tung, 1/94 */ index_file_block_number = look_up_partialword_in_dictionary(field_name, NULL, &number_of_occurances, db); #else index_file_block_number = look_up_partialword_in_dictionary(NULL, &number_of_occurances, db); #endif } #endif #ifdef NESTED_BOOLEANS /* tung, 1/94 */ if(number_of_operands > 1) { long index = 0; #ifdef FIELDS /* tung, 1/94 */ if(weight == NUMERIC_FLAG || weight == PARTIAL_FLAG) { #else if(weight == PARTIAL_FLAG) { #endif if(!make_doc_ids_array(operand_id, db->doc_table_allocated_entries)) return(-1); search_result_array[operand_id].number_of_hits = numeric_partial_valid_entries; /*for (count=0; count < db->doc_table_allocated_entries; count++) {*/ for (count=doc_start; count <= doc_end ; count++) { if(NumPart_score_array[count] > 0) { ((search_result_array[operand_id]).doc_ids_array[index]).doc_id = count; ((search_result_array[operand_id]).doc_ids_array[index]).score = NumPart_score_array[count]; NumPart_score_array[count] = 0.0; ++index; } if(index == numeric_partial_valid_entries) break; } } } #endif /* tung, 10/93 */ #ifdef NESTED_BOOLEANS if((number_of_operands > 1) && (search_result_array != NULL)) { if(!save_operand_id(operand_id, search_result_array,db->doc_table_allocated_entries)) return(-1); search_result_array[operand_id].operand_id = operand_id; ++operand_id; } #endif 
/* tung, 10/93 */ #ifndef NESTED_BOOLEANS /* tung, 10/94 */ #ifdef BOOLEANS for (count=0; count < db->doc_table_allocated_entries; count++) { /* 12/91 GS TLG */ if (!gLastAnd) { /* 12/91 GS TLG */ prev_score_array[count] = document_score_array[count]; /* 12/91 GS TLG */ } /* 12/91 GS TLG */ else { /* 12/91 GS TLG */ if ((document_score_array[count] == prev_score_array[count]) /* 12/91 GS TLG */ || (prev_score_array[count] == 0)) { document_score_array[count] = 0; /* 12/91 GS TLG */ prev_score_array[count] = 0; /* 12/91 GS TLG */ } /* 12/91 GS TLG */ else { prev_score_array[count] = document_score_array[count]; /* 12/91 GS TLG */ } /* 12/91 GS TLG */ } /* 12/91 GS TLG */ } /* 12/91 GS TLG */ /* if (gLastAnd) printf("search_word: boolean `and' scored\n"); */ #endif #endif #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(0); } else if(0 == index_file_block_number){ /* an error occurred on looking up the word */ #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif return(-1); } else { /* index_file_block_number is negative */ #ifdef NESTED_BOOLEANS /* tung, 10/93 */ if((number_of_operands > 1) && (search_result_array != NULL)) { if(!save_operand_id(operand_id, search_result_array,db->doc_table_allocated_entries)) return(-1); search_result_array[operand_id].operand_id = operand_id; search_result_array[operand_id].number_of_hits = 0; ++operand_id; } #else #ifdef BOOLEANS if (gLastAnd) for (count=0; count < db->doc_table_allocated_entries; count++) { document_score_array[count] = 0; prev_score_array[count] = 0; } gLastNot= gLastAnd= false; #endif #endif return(0); /* word not present */ } } /* now collect the best hits */ long finished_search_word(db) database *db; { #ifdef NESTED_BOOLEANS long number_of_hits; /* tung, 10/93 */ #endif #ifdef BOOL if (currentQuery != NULL) return; /* do nothing for boolean */ #endif /* def BOOL */ /* tung, 10/93 */ #ifdef NESTED_BOOLEANS if((number_of_operands > 1) && (search_result_array != NULL)) { number_of_hits = 
retriev_result(db->doc_table_allocated_entries, document_score_array); clear_search_result_array(&number_of_operands); } #endif /* tung, 10/93 */ /* check the document_score_array */ if(document_score_array_len < db->doc_table_allocated_entries) make_document_score_array(db->doc_table_allocated_entries); make_best_hits_array(max_hit_retrieved); sort_best_hits(db); syn_Free( db->syn_Table,&db->syn_Table_Size ); return(0); }